# Clean Environment -------------------------------------------------------
rm(list = ls())

# Load and Filter Data ---------------------------------------------------
df <- readRDS("df_full_cjoint.rds") %>%
  filter(ResponseId != "respondent_2592" & ResponseId != "respondent_0084") %>% # remove obs with no variance
  mutate(ResponseId = as.factor(ResponseId))

df_covariates <- readRDS("df_full_uod.rds") %>%
  filter(ResponseId != "respondent_2592" & ResponseId != "respondent_0084") %>% # remove obs with no variance
  mutate(ResponseId = as.factor(ResponseId))

n <- length(unique(df$ResponseId))
respondent_list <- unique(df$ResponseId)

# Transform Character Variables to Factors -------------------------------
cols <- c("attr_gender", "attr_party", "attr_tax", "attr_abortion", "attr_media", "attr_judges", "attr_age_fac")
df[cols] <- lapply(df[cols], as.factor)

# NAIVE OLS ---------------------------------------------------------------
# Relevel factors
df <- within(df, attr_judges <- factor(attr_judges, levels = c("judges_L","judges_A","judges_M")))
df <- within(df, attr_media <- factor(attr_media, levels = c("media_L","media_A","media_M")))

# Adjust matrix & run IMCEs
coef <- matrix(NA, nrow = n, ncol = 6)
se <- matrix(NA, nrow = n, ncol = 6)
p <- matrix(NA, nrow = n, ncol = 4)

for (i in 1:n) {
  mod <- lm(rating0 ~ attr_judges, data = df, subset = (ResponseId == respondent_list[i]))
  summ <- summary(mod)
  coef[i, 1] <- summ$coefficients[1, 1]
  coef[i, 2] <- summ$coefficients[2, 1]
  coef[i, 3] <- summ$coefficients[3, 1]
  p[i, 1] <- summ$coefficients[2, 4]
  p[i, 2] <- summ$coefficients[3, 4]
  
  mod <- lm(rating0 ~ attr_media, data = df, subset = (ResponseId == respondent_list[i]))
  summ <- summary(mod)
  coef[i, 4] <- summ$coefficients[1, 1]
  coef[i, 5] <- summ$coefficients[2, 1]
  coef[i, 6] <- summ$coefficients[3, 1]
  p[i, 3] <- summ$coefficients[2, 4]
  p[i, 4] <- summ$coefficients[3, 4]
}

# Merge IMCE, SE, ResponseId
coef.exp <- data.frame(cbind(coef, p))
coef.exp <- coef.exp %>% add_column(df_covariates$ResponseId, .before = 1)

colnames(coef.exp) <- c(
  'ResponseId', 'judges_L', 'judges_A', 'judges_M', 'media_L', 'media_A', 'media_M',
  'p_judges_A', 'p_judges_M', 'p_media_A', 'p_media_M'
)

OLS_naive <- coef.exp %>%
  dplyr::select(ResponseId, judges_M, judges_A, media_M, media_A) %>%
  dplyr::rename(
    judges_M_naive = judges_M,
    judges_A_naive = judges_A,
    media_M_naive = media_M,
    media_A_naive = media_A
  )

# BOOTSTRAP OLS -----------------------------------------------------------
# Draw plausible IMCE values (Nonparametric bootstrap)
b.attr_judges_M <- matrix(0, nrow = 100, ncol = 1)
b.attr_judges_A <- matrix(0, nrow = 100, ncol = 1)
b.attr_media_M <- matrix(0, nrow = 100, ncol = 1)
b.attr_media_A <- matrix(0, nrow = 100, ncol = 1)
draws.data <- NULL

for (i in 1:n) {
  set.seed(211)
  data.i <- df[which(df$ResponseId == levels(as.factor(df$ResponseId))[i]), ]
  
  # Judges_M bootstrap
  b <- 1
  while (b < 101) {
    data.b <- data.i[sample(nrow(data.i), 24, replace = TRUE), ]
    
    # Check if attr_judges_M has at least 2 levels
    if (length(unique(data.b$attr_judges_M)) > 1) {
      mod <- lm(rating0 ~ attr_judges_M, data = data.b)
      if (!is.na(mod$coefficients[2])) {
        b.attr_judges_M[b] <- mod$coefficients[2]
        b <- b + 1
      }
    }
  }
  
  # Judges_A bootstrap
  b <- 1
  while (b < 101) {
    data.b <- data.i[sample(nrow(data.i), 24, replace = TRUE), ]
    
    # Check if attr_judges_A has at least 2 levels
    if (length(unique(data.b$attr_judges_A)) > 1) {
      mod <- lm(rating0 ~ attr_judges_A, data = data.b)
      if (!is.na(mod$coefficients[2])) {
        b.attr_judges_A[b] <- mod$coefficients[2]
        b <- b + 1
      }
    }
  }
  
  # Media_M bootstrap
  b <- 1
  while (b < 101) {
    data.b <- data.i[sample(nrow(data.i), 24, replace = TRUE), ]
    
    # Check if attr_media_M has at least 2 levels
    if (length(unique(data.b$attr_media_M)) > 1) {
      mod <- lm(rating0 ~ attr_media_M, data = data.b)
      if (sum(is.na(mod$coefficients)) == 0) {
        b.attr_media_M[b] <- mod$coefficients[2]
        b <- b + 1
      }
    }
  }
  
  # Media_A bootstrap
  b <- 1
  while (b < 101) {
    data.b <- data.i[sample(nrow(data.i), 24, replace = TRUE), ]
    
    # Check if attr_media_A has at least 2 levels
    if (length(unique(data.b$attr_media_A)) > 1) {
      mod <- lm(rating0 ~ attr_media_A, data = data.b)
      if (!is.na(mod$coefficients[2])) {
        b.attr_media_A[b] <- mod$coefficients[2]
        b <- b + 1
      }
    }
  }
  
  # Compile results into a temporary data frame
  temp <- cbind.data.frame(
    levels(df$ResponseId)[i],
    1:100,
    b.attr_judges_M,
    b.attr_judges_A,
    b.attr_media_M,
    b.attr_media_A
  )
  
  # Prepend the zero row
  temp <- rbind.data.frame(
    c(levels(df$ResponseId)[i], rep(0, times = 5)), temp
  )
  
  # Append to draws.data
  draws.data <- rbind.data.frame(draws.data, temp)
}

# Export drawn values to CSV
colnames(draws.data) <- c(
  'ResponseId', 'impno', 'imce_attr_judges_M', 'imce_attr_judges_A',
  'imce_attr_media_M', 'imce_attr_media_A'
)

OLS_boot <- draws.data %>%
  filter(impno != 0) %>%
  group_by(ResponseId) %>%
  summarise(
    judges_M_boot = mean(as.numeric(imce_attr_judges_M)),
    judges_A_boot = mean(as.numeric(imce_attr_judges_A)),
    media_M_boot = mean(as.numeric(imce_attr_media_M)),
    media_A_boot = mean(as.numeric(imce_attr_media_A))
  )

# Combine Naive and Bootstrap Estimates -----------------------------------
IMCE_full <- OLS_naive %>%
  left_join(OLS_boot, by = "ResponseId")

write_rds(IMCE_full, file = "IMCEs.rds")

# Robinson/Duch Random Forest ---------------------------------------------

# Binary choice
df_subset <- df %>%
  dplyr::select(
    ResponseId, attr_media, attr_judges, choice, chosen, lib, maj, auth,
    party_preference, income, education, financial_situation, gender, age
  ) %>%
  drop_na(
    ResponseId, attr_media, attr_judges, choice, chosen, lib, maj, auth,
    party_preference, income, education, financial_situation, gender, age
  )

set.seed(211)

cj_model <- cjbart(
  data = df_subset,
  Y = "chosen",
  id = "ResponseId",
  round = "choice"
  )

IMCE_model <- IMCE(
  df_subset,
  cj_model,
  attribs = c("attr_judges", "attr_media"),
  ref_levels = c("judges_L", "media_L"),
  method = 'rubin'
  )

vimp_estimates <- het_vimp(imces = IMCE_model)

save(cj_model, IMCE_model, vimp_estimates, file = "cjbart_choice.RData")

# Rating
df_subset <- df %>%
  dplyr::select(
    ResponseId, attr_media, attr_judges, choice, rating0, lib, maj, auth,
    party_preference, income, education, financial_situation, gender, age
  ) %>%
  drop_na(
    ResponseId, attr_media, attr_judges, choice, rating0, lib, maj, auth,
    party_preference, income, education, financial_situation, gender, age
  )

set.seed(211)

cj_model_rating <- cjbart(
  data = df_subset,
  Y = "rating0",
  id = "ResponseId",
  round = "choice")

IMCE_model_rating <- IMCE(
  df_subset,
  cj_model_rating,
  attribs = c("attr_judges", "attr_media"),
  ref_levels = c("judges_L", "media_L"),
  method = 'rubin'
)

vimp_estimates_rating <- het_vimp(imces = IMCE_model_rating)

save(cj_model_rating, IMCE_model_rating, vimp_estimates_rating, file = "cjbart_rating.RData")
